Main text figures
library(here)
source(here("0-config.R"))
#load(here(data_path, "swift_spatial_data.Rda"))
#ind_all <- read_csv(here(data_path, "ind_all.csv"))
clu_random_modeldata <- readRDS(file = here(data_path, "clu_random_modeldata_public.rds")) %>%
rename(cluster_id = cluster_id_public) %>%
# also add label for survey month
mutate(survey_label = paste0("Month ", survey))
# 0-5 data
clu_random_0to5 <- clu_random_modeldata %>%
filter(age_group == "0-5y")
clu_random_0to5_sf <- clu_random_0to5 %>%
st_as_sf(coords = c("lon", "lat"), crs = swift_crs)
# read in model results
sl_summary <- read_rds(file = here(output_path, "sl_summary_public111.rds"))
sl_results <- read_rds(file = here(output_path, "sl_results_public111.rds"))
cv_logistic_summary <- read_rds(file = here(output_path, "cv_logistic_summary_public111.rds"))
cv_logistic_results <- read_rds(file = here(output_path, "cv_logistic_results_public111.rds"))
# variables
n_permute_variog <- 1000 # set number of variogram null permutations
n_bs_cor <- 1000 # set number of correlation bootstrap replicates
save_figs <- FALSE
file_types <- c("png", "tiff")
Table 1
## Table 1: summary of sample sizes
## prepare descriptive statistics for Table 1, Table S1, Table S2 -----
desc_stats_df <- lapply(
c("pcr", "sero", "clin", "Pgp3", "Ct694", "tf", "ti"),
function(x){
prevalence_var <- paste0("prevalence_", x)
prevalence_lwriqr <- paste0("prevalence_", x, "_lwriqr")
prevalence_upriqr <- paste0("prevalence_", x, "_upriqr")
clu_random_modeldata %>%
group_by(survey, age_group) %>%
summarise(!!prevalence_lwriqr := quantile(get(prevalence_var), probs = 0.25, na.rm = TRUE),
!!prevalence_upriqr := quantile(get(prevalence_var), probs = 0.75, na.rm = TRUE),
!!prevalence_var := median(get(prevalence_var), na.rm = TRUE),
!!paste0("n_tested_", x) := sum(get(paste0("n_tested_", x))),
.groups = "drop") %>%
# format with exactly one digit after decimal
mutate_at(vars(starts_with("prevalence_")), list(~ sprintf(. * 100, fmt = '%#.1f'))) %>%
mutate(!!paste0("summary_", x) := paste0(get(prevalence_var), " (", get(prevalence_lwriqr), "-", get(prevalence_upriqr), ")")) %>%
dplyr::select(-starts_with("prevalence_"))
}) %>% purrr::reduce(left_join, by = c("survey", "age_group")) %>%
# add overall population count
# removed for public code as individual-level data not available
# left_join(ind_all %>%
# filter(random_sample == 1,
# (!is.na(clin_pos) | !is.na(sero_pos) | !is.na(pcr_individual) | !is.na(pcr_pool))) %>%
# group_by(survey, age_group) %>%
# count(name = "n_pop"),
# by = c("survey", "age_group")) %>%
# modify age group labels for table
mutate(age_group_label = case_when(
age_group == "0-5y" ~ "0to5",
age_group == "6-9y" ~ "6to9",
age_group == "10+y" ~ "over9"))
## prep df for Table 1 -----
table1_df <- desc_stats_df %>%
# drop any individuals not in 0-9yo
filter(age_group %in% c("0-5y", "6-9y")) %>%
# select final columns for table
#dplyr::select(survey, age_group_label, n_pop, summary_pcr, summary_clin, summary_sero) %>%
dplyr::select(survey, age_group_label, summary_pcr, summary_clin, summary_sero) %>%
pivot_wider(names_from = age_group_label,
#values_from = c("n_pop", "summary_pcr", "summary_clin", "summary_sero"))
values_from = c("summary_pcr", "summary_clin", "summary_sero"))
## create Table 1 with gt -----
table1 <- table1_df %>%
gt() %>%
# format n with commas
# fmt_number(
# columns = starts_with("n_pop"),
# decimals = 0
# ) %>%
tab_spanner(
label = "Median prevalence (%), 0-5 year olds (IQR)",
#columns = vars(n_pop_0to5, summary_pcr_0to5, summary_clin_0to5, summary_sero_0to5)
columns = vars(summary_pcr_0to5, summary_clin_0to5, summary_sero_0to5)
) %>%
tab_spanner(
label = "Median prevalence (%), 6-9 year olds (IQR)",
#columns = vars(n_pop_6to9, summary_pcr_6to9, summary_clin_6to9, summary_sero_6to9)
columns = vars(summary_pcr_6to9, summary_clin_6to9, summary_sero_6to9)
) %>%
cols_label(
survey = "Month",
#n_pop_0to5 = "n",
#n_pop_6to9 = "n",
summary_pcr_0to5 = "PCR",
summary_pcr_6to9 = "PCR",
summary_clin_0to5 = "Clinical TF/TI",
summary_clin_6to9 = "Clinical TF/TI",
summary_sero_0to5 = "Serology",
summary_sero_6to9 = "Serology"
) %>%
# table styling
fmt_missing(
columns = everything(),
missing_text = "-",
) %>%
cols_align(
align = "center",
columns = everything()
) %>%
tab_style(
style = cell_text(size = "small"),
locations = cells_body(
columns = everything()
)
) %>%
# add footnotes
# tab_footnote(
# footnote = "Number of children surveyed",
# locations = cells_column_labels(
# columns = starts_with("n_"))
# ) %>%
tab_footnote(
footnote = "Polymerase chain reaction",
locations = cells_column_labels(
columns = starts_with("summary_pcr_"))
) %>%
tab_footnote(
footnote = "Trachomatous inflammation - follicular / trachomatous inflammation - intense",
locations = cells_column_labels(
columns = starts_with("summary_clin_"))
) %>%
tab_footnote(
footnote = "Serology was not measured for a random sample of 6–9-year-olds at months 12 and 24",
locations = cells_column_labels(
columns = starts_with("summary_sero_"))
) %>%
tab_options(
footnotes.font.size = "x-small"
)
table1
| Month | Median prevalence (%), 0-5 year olds (IQR) | Median prevalence (%), 6-9 year olds (IQR) | ||||
|---|---|---|---|---|---|---|
| PCR1 | Clinical TF/TI2 | Serology3 | PCR1 | Clinical TF/TI2 | Serology3 | |
| 0 | 5.6 (2.9-18.1) | 62.9 (51.0-72.5) | 25.0 (10.1-34.8) | 3.5 (0.0-13.9) | 40.3 (25.9-54.9) | 49.2 (29.8-60.2) |
| 12 | 19.1 (6.6-30.2) | 50.8 (40.6-61.1) | 29.7 (15.6-40.2) | 10.9 (5.7-17.4) | 21.3 (14.3-27.8) | NA (NA-NA) |
| 24 | 27.4 (11.6-34.3) | 67.5 (55.5-77.4) | 33.3 (20.5-39.0) | 19.9 (9.7-34.2) | 45.1 (29.4-53.4) | NA (NA-NA) |
| 36 | 29.3 (16.2-46.8) | 56.7 (45.2-64.3) | 33.3 (23.5-42.3) | 21.7 (15.2-38.2) | 38.2 (30.1-53.6) | 50.8 (28.9-65.4) |
|
1
Polymerase chain reaction
2
Trachomatous inflammation - follicular / trachomatous inflammation - intense
3
Serology was not measured for a random sample of 6–9-year-olds at months 12 and 24
|
||||||
if(save_figs){
gtsave(table1,
filename = "table1_summary_stats.png",
path = here(output_path))
}
Figure 1
# will not run properly without correct spatial information
# read in ethiopia shape files
eth_shp_files <- st_read(here(data_path, "hdx", "eth_adm_csa_bofed_20201028_shp", "eth_admbnda_adm3_csa_bofed_20201027.shp"))
## Inset: map of Amhara in Ethiopia / Eastern Africa context -----
# pull africa country outlines
africa <- rnaturalearth::ne_countries(continent = "africa",
returnclass = "sf")
# create bounding box around ethiopia
eth_bbox <- st_bbox(africa %>% filter(admin == "Ethiopia"))
eth_bbox <- eth_bbox + c(-0.1, -0.15, 0.1, 0.1)
names(eth_bbox) <- c("left", "bottom", "right", "top")
eth_bbox
eth_bbox_sfc <- eth_bbox %>% st_bbox(crs = swift_crs) %>% st_as_sfc() # save version as sf polygon
eth_shp <- africa %>% filter(admin == "Ethiopia")
# create smoothed amhara outline
amhara_shp <- eth_shp_files %>%
filter(ADM1_EN == "Amhara") %>%
pull(geometry) %>%
st_union() %>%
st_cast("POLYGON") %>%
head(1) %>% # select largest polygon
smoothr::fill_holes(threshold = units::set_units(6000, km^2)) # remove outline of Lake Tana
# plot map
fig1_inset <- ggplot() +
#ggmap::get_stamenmap(eth_bbox, zoom = 7, maptype = "terrain") %>% ggmap() + # to add stamenmap background
#geom_sf(data = st_difference(eth_bbox_sfc, amhara_shp), fill = "grey", color = NA, alpha = 0.7, inherit.aes = FALSE) + # gray out non-ethiopia areas
#geom_sf(data = africa %>% filter(admin == "Ethiopia"), color = "darkgrey", fill = NA, lwd = 0.8) + # ethiopia country outline # 20210420 map
geom_sf(data = st_crop(africa, eth_bbox), color = "grey50", fill = NA, lwd = 0.8) + # ethiopia country outline
geom_sf(data = amhara_shp, color = "gray", fill = "gray", lwd = 0.6, alpha = 0.3) + # white out non-amhara area
# bounding box around swift area
geom_sf(data = swift_bbox %>% st_bbox(crs = swift_crs) %>% st_as_sfc(), color = "black", fill = NA, lwd = 1) +
geom_sf_text(data = africa %>% filter(admin == "Ethiopia"), aes(label = admin), alpha = 0.6, size = 4,
nudge_x = 1, nudge_y = -1.2, color = "black") + # add ethiopia label
coord_sf(crs = swift_crs, xlim = c(eth_bbox["left"], eth_bbox["right"]), ylim = c(eth_bbox["bottom"], eth_bbox["top"]), expand = FALSE) + # note: this code has to go AFTER geom_sf in order for expand = FALSE to work
theme_bw() +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.background = element_blank(),
panel.border = element_rect(size = 1.2))
## Main map: study site with woreda outlines -----
swift_bbox_big <- swift_bbox + c(0,0,0.15,0) # make bbox larger to accommodate inset at top right
swift_bbox_big_sfc <- swift_bbox_big %>% st_bbox(crs = swift_crs) %>% st_as_sfc()
fig1_main <- ggmap::get_stamenmap(swift_bbox_big, maptype = "terrain") %>%
ggmap() +
coord_sf(crs = swift_crs) +
# white out non-swift areas
geom_sf(data = st_difference(swift_bbox_big_sfc, swift_woredas %>% pull(geometry) %>% st_union()),
fill = "white", color = NA, alpha = 0.5, inherit.aes = FALSE) +
# add woreda outlines
geom_sf(data = swift_woredas,
color = "black",
size = 0.7,
fill = NA,
inherit.aes = FALSE) +
# add cluster points with prevalence
geom_sf(data = cluster_sf, # original lat/lon
color = "black",
fill = "black",
alpha = 0.5,
pch = 21,
stroke = 1, # thickness of pch outline
size = 2,
inherit.aes = FALSE) +
# add woreda name labels
# respelled to match WUHA protocol paper
geom_sf_label(data = swift_woredas %>%
mutate(ADM3_label = case_when(
ADM3_EN == "Gaz Gibla" ~ "Gazgibella",
ADM3_EN == "Sekota" ~ "Sekota Zuria",
ADM3_EN == "Sekota town" ~ "Sekota Ketema"
)),
aes(label = ADM3_label),
alpha = 0.6,
size = 4.1,
inherit.aes = FALSE) +
# add map scale
ggsn::scalebar(cluster_sf,
dist = 5,
dist_unit = "km",
transform = TRUE,
model = "WGS84",
location = "bottomleft",
st.size = 3) +
theme_bw() +
# wrap legend text for fill
theme(axis.ticks = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
axis.line = element_blank(),
panel.border = element_rect(size = 1.2))
fig1 <- ggdraw(fig1_main) + draw_plot(fig1_inset, x = 0.54, y = 0.68, width = 0.33, height = 0.33)
fig1
if(save_figs){
for (ft in file_types){
ggsave(filename = here(output_path, paste0("fig1_study_map.", ft)),
device = ft,
plot = fig1,
width = 7, height = 7)
}
}
Figure 2
## prepare for maps -----
# set grid size in degrees
# unit depends on coordinate system, in this case WGS 84
grid_size <- 33.19/3600 # 33.19 arc-seconds, ~1km at 12.51N latitude
# draw convex hull around clusters
swift_convex_hull <- cluster_sf %>% st_union() %>% st_convex_hull()
swift_hull_grid <- st_make_grid(swift_convex_hull,
cellsize = c(grid_size, grid_size),
crs = swift_crs,
what = "polygons",
square = FALSE)
## Panel A: Prevalence maps ----
# `get_grid_preds` function: get predictions using simple model (mean and Matern) for given variable
# input_grid: sf grid to get values over
# input_df: dataframe to use to build model
# input_var: outcome variable for model (e.g. "pcr")
# input_survey: survey month to build model for (e.g. 0, 12, 24 or 36)
get_grid_preds <- function(input_grid, input_df, input_var, input_survey){
pos_var <- paste0("n_pos_", input_var)
tested_var <- paste0("n_tested_", input_var)
temp_form <- paste0("cbind(", pos_var, ",", tested_var, "-", pos_var, ") ~ lat + lon + Matern(1|lat + lon)")
temp_mod <- spaMM::fitme(as.formula(temp_form),
data = input_df %>% filter(survey == input_survey),
family = binomial(link = "logit"))
temp_preds <- predict(object = temp_mod,
type = "response",
newdata = st_centroid(get(input_grid)) %>%
st_as_sf(crs = swift_crs) %>%
mutate(lat = sf::st_coordinates(x)[,2],
lon = sf::st_coordinates(x)[,1]) %>%
st_drop_geometry())
ret <- get(input_grid) %>%
as.data.frame %>%
st_as_sf(crs = swift_crs) %>%
mutate(trach_ind = paste0("prevalence_", input_var),
survey = input_survey,
pred_prevalence = temp_preds)
return(ret)
}
# get predictions over all survey_list for PCR
# will throw warning about centroids, but seems to work okay (maybe bc this area is close to equator?)
grid_preds_0to5_pcr <- mapply(get_grid_preds,
input_survey = survey_list,
MoreArgs = list(input_df = clu_random_modeldata %>% filter(age_group == "0-5y"),
input_var = "pcr",
input_grid = "swift_hull_grid"),
SIMPLIFY = FALSE) %>%
bind_rows() %>%
# change pred_prevalence to numeric
mutate(pred_prevalence = as.numeric(as.character(pred_prevalence))) %>%
mutate(survey_label = paste0("Month ", survey)) %>%
# st_make_grid uses rectangular bounding box, filter for grid cells that overlap convex hull
mutate(in_hull = st_intersects(., swift_convex_hull, sparse = FALSE)) %>%
filter(in_hull == TRUE) %>%
dplyr::select(-in_hull)
# plot cluster-level maps
# `get_grid_preds_map`
get_grid_preds_map <- function(input_preds, input_var, fill_label){
# for max fill, choose lowest value larger than max pred that is divisible by 0.2
obs_vals <- clu_random_0to5 %>% pull(get(paste0("prevalence_", input_var)))
pred_vals <- input_preds$pred_prevalence
# set max for color gradient bar
temp_fill_max <- ceiling(max(c(obs_vals, pred_vals))*10)
if((temp_fill_max %% 2) != 0){temp_fill_max <- temp_fill_max + 1}
temp_fill_max <- temp_fill_max/10
ret <- ggmap::get_map(swift_bbox, source = "stamen") %>%
ggmap() +
coord_sf(crs = swift_crs) +
## add woreda outlines
geom_sf(data = swift_woredas,
color = "white",
fill = NA,
inherit.aes = FALSE) +
## add smoothed predictions
geom_sf(data = input_preds,
aes(fill = pred_prevalence),
color = NA,
alpha = 0.9,
inherit.aes = FALSE) +
## add cluster points with prevalence
geom_sf(data = clu_random_modeldata %>% # use private data with real lat/lon
filter(age_group == "0-5y") %>%
st_as_sf(coords = c("lon", "lat"), crs = swift_crs),
aes(fill = get(paste0("prevalence_", input_var))),
color = "black",
pch = 21,
inherit.aes = FALSE) +
## add map scale
ggsn::scalebar(input_preds,
dist = 5,
dist_unit = "km",
transform = TRUE,
model = "WGS84",
location = "bottomleft",
st.size = 2,
facet.var = "survey_label") +
## formatting
scale_fill_gradientn(colors = viridis::viridis_pal()(9),
limits = c(0, temp_fill_max),
labels = scales::percent,
guide = guide_colorbar(frame.colour = "black",
frame.linewidth = 1.5,
ticks.colour = "black",
ticks.linewidth = 1.5)) +
facet_grid(cols = vars(survey_label)) +
theme_classic() +
# wrap legend text for fill
labs(fill = str_wrap(fill_label, width = 10)) +
theme(legend.position = "right",
legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
axis.ticks = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
axis.line = element_blank(),
plot.margin = unit(c(0,0,0,3), 'lines'))
return(ret)
}
fig2A <- get_grid_preds_map(input_preds = grid_preds_0to5_pcr,
input_var = "pcr",
fill_label = "PCR prevalence")
## Panel B: variograms for months 0-36 with MC envelopes and models -----
# create SpatialPointsDataFrame for clusters to allow for variogram building
clu_spatial <- clu_random_0to5_sf %>%
filter(survey == 0) %>%
st_as_sf(crs = swift_crs) %>%
as("Spatial")
# rule of thumb: limit max distance in variogram to half of maximum interpoint distance
clu_distances <- sapply(1:nrow(clu_spatial), function(x) geosphere::distGeo(p1 = clu_spatial, p2 = clu_spatial[x,])) # estimate distance from each point to all other points
half_max_dist <- max(clu_distances) / 2 / 1000 # divide by 1000 to turn meters to km (33.26 km)
## `get_variograms`: function to estimate variograms, fitted models, and monte carlo envelopes
# input_var: trachoma indicator to create variograms for (e.g. "pcr"); for standardization with other functions
# input_df: input dataframe
# input_models: list of models to fit to variogram
# n_permute: number of permutations for monte carlo envelope
# iterate through survey_list
# see Diggle and Ribiero 2006: 2.3.2 Spatial exploratory analysis
variog_colors <- c(#"Sph" = RColorBrewer::brewer.pal(n = 8, name = "Dark2")[7],
"Exp" = RColorBrewer::brewer.pal(n = 8, name = "Dark2")[4],
"Mat" = RColorBrewer::brewer.pal(n = 8, name = "Dark2")[5])
variogram_inside_loop <- function(s, input_var, input_df, input_models, n_permute, residualize){
# create temporary spatial dataframe for survey-specific variogram
temp_outcome <- paste0("prevalence_", input_var)
temp_spatial <- input_df %>%
filter(survey == s) %>%
st_as_sf(coords = c("lon", "lat"), remove = FALSE, crs = swift_crs) %>%
as("Spatial")
## get variogram model fits
# initialize containers to save variogram results
variog_lines <- data.frame(model = character(), dist = numeric(), gamma = numeric())
variog_stats <- data.frame(model = character(), sserr = numeric(), effrange = numeric())
# due to stationarity assumptions, have to residualize trends prior to variograms and moran's I
if(residualize){
temp_spatial$trend_resid <- resid(lm(get(temp_outcome)~lat+lon, data = temp_spatial))
temp_outcome <- "trend_resid"
}
for(temp_mod in input_models){
# note that automap creates fewer, but larger bins than gstat (gstat results were much more unstable)
# note: `dist` in results is the average distance of all pairs in that bin (see `gstat` documentation)
temp_fit <- automap::autofitVariogram(formula = get(temp_outcome)~1,
input_data = temp_spatial,
model = temp_mod, # ability to fit multiple models at once is deprecated?
kappa = c(1.5, 2.5, 3.5, 5, 10, 15, 20), # only used for Matern
# miscFitOptions = list(merge.small.bins = TRUE) # alternative way to create larger bins
miscFitOptions = list(min.np.bin = 10)) # automap has this feature, gstat does not
temp_var_model <- temp_fit$var_model
temp_line <- gstat::variogramLine(temp_var_model, maxdist = half_max_dist)
variog_lines <- variog_lines %>% bind_rows(temp_line %>% mutate(model = temp_mod))
temp_effrange_y <- temp_var_model[which(temp_var_model$model == temp_mod), "psill"]*0.95 + temp_var_model[which(temp_var_model$model == "Nug"), "psill"]
temp_effrange <- temp_line$dist[min(which(temp_line$gamma > temp_effrange_y))] # estimate effective range as 95% of sill + nugget
variog_stats <- variog_stats %>% bind_rows(data.frame(model = temp_mod, sserr = temp_fit$sserr, effrange = temp_effrange))
}
## estimate Monte Carlo envelope
# get empirical variogram for observed data
permute_results <- automap::autofitVariogram(formula = get(temp_outcome)~1,
input_data = temp_spatial,
miscFitOptions = list(min.np.bin = 10))$exp_var %>%
dplyr::select(np, dist, gamma) %>%
as.data.frame()
# get moran's I
temp_outcome_morani <- paste0("prevalence_", input_var)
temp_distances <- sapply(1:nrow(temp_spatial), function(x) geosphere::distGeo(p1 = temp_spatial, p2 = temp_spatial[x,]))
temp_dist_wts <- 1/temp_distances
diag(temp_dist_wts) <- 0
temp_ape_morani <- ape::Moran.I(temp_spatial@data[,temp_outcome_morani], temp_dist_wts)
#print(temp_ape_morani)
permute_results_morani <- c()
# conduct permutations
for(i in 1:n_permute){
# choose order of 40 clusters
temp_permute <- sample(1:40, size = 40, replace = FALSE)
temp_permute_spatial <- temp_spatial@data %>%
dplyr::select(cluster_id, eval(temp_outcome), eval(temp_outcome_morani)) %>%
# rearrange order of PCR values
mutate(temp_permute = temp_permute) %>%
arrange(temp_permute) %>%
# tack on spatial data in original order
bind_cols(input_df %>% filter(survey == s) %>% dplyr::select(cluster_id_sf = cluster_id, lat, lon)) %>%
# make into Spatial data
st_as_sf(coords = c("lon", "lat"), crs = swift_crs, remove = FALSE) %>%
as("Spatial")
# fit variogram
temp_variog <- automap::autofitVariogram(formula = get(temp_outcome)~1,
input_data = temp_permute_spatial,
miscFitOptions = list(min.np.bin = 10))
permute_results <- permute_results %>%
left_join(temp_variog$exp_var %>%
dplyr::select(np, dist, gamma) %>%
rename(!!paste0("gamma_", i) := gamma),
by = c("np", "dist"))
permute_results_morani[i] <- ape::Moran.I(temp_permute_spatial@data[,temp_outcome_morani], temp_dist_wts)$observed
}
# get upper and lower bounds for envelope
permute_bounds <- apply(permute_results %>% dplyr::select(starts_with("gamma")), 1, quantile, c(0.025, 0.975)) %>%
t() %>%
as.data.frame() %>%
rename(q0.025 = "2.5%", q0.975 = "97.5%") %>%
mutate(fill_flag = "1")
permute_results <- permute_results %>% bind_cols(permute_bounds)
## moran i inset
permute_bounds_morani <- quantile(c(permute_results_morani, temp_ape_morani$observed), c(0.025, 0.975))
morani_p <- round((sum(permute_results_morani>=temp_ape_morani$observed)+1)/(n_permute + 1), digits = 2)
temp_morani_fig <- ggplot(data = data.frame(morani_obs = c(permute_results_morani, temp_ape_morani$observed))) +
geom_histogram(aes(x = morani_obs), bins = 30, color = "white", fill = "grey85") +
geom_vline(aes(xintercept = temp_ape_morani$observed), color = "red") +
coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(0, (n_permute + 1)/5)) +
labs(x = "Moran's I", y = "Count") +
theme_classic() +
theme(title = element_text(size = 9))
## create variogram plot of points and each model fit
temp_plot <- ggplot() +
geom_ribbon(data = permute_results, aes(x = dist, ymin = q0.025, ymax = q0.975, fill = fill_flag), alpha = 0.3) +
geom_point(data = permute_results, aes(x = dist, y = gamma)) + # empirical variog, does not depend on model
geom_text(data = permute_results, aes(x = dist, y = 0.06*0.95, label = np), size = 2) + # empirical variog, does not depend on model
geom_line(data = variog_lines, aes(x = dist, y = gamma, color = model)) +
geom_vline(data = variog_stats, aes(xintercept = effrange, color = model), lty = "dashed", show.legend = FALSE) +
scale_color_manual(values = variog_colors, labels = c("Exp" = "Exponential", "Mat" = "Matern")) +
scale_fill_manual(values = c("1" = "grey"), labels = c("1" = "Monte Carlo\nenvelope")) +
labs(#title = paste("variogram for", curr_ind, "- survey month", s),
#subtitle = "labels above each point are bin sizes",
x = "Distance (km)", y = "Semivariance",
color = "Fitted model",
fill = "") +
coord_cartesian(xlim = c(0, max(permute_results$dist)), y = c(0, 0.062)) +
theme_classic() +
theme(title = element_text(size = 9),
panel.grid.major = element_line(color = "grey95"),
legend.position = "none") +
guides(fill = guide_legend(order = 2),
color = guide_legend(order = 1))
if(s != 0){
temp_plot <- temp_plot +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank())
temp_morani_fig <- temp_morani_fig +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.line.y = element_blank())
}
if(s == 0){
temp_plot <- temp_plot +
geom_text(aes(x = 8, y = 0.062), label = "Bin sample sizes", size = 2.8) +
theme(plot.margin = unit(c(0.3,0,0.3,1), 'lines'))
temp_morani_fig <- temp_morani_fig +
annotate("text", x = -0.14, y = (n_permute + 1)/5*0.95, size = 3,
label = paste0("p = ", format(morani_p, nsmall = 2))) +
theme(plot.margin = unit(c(0,0,0.4,1), 'lines'))
}
if(s == 36){
temp_plot <- temp_plot +
theme(legend.position = "right",
legend.title = element_text(size = 9),
legend.text = element_text(size = 8))
temp_morani_fig <- temp_morani_fig +
annotate("text", x = -0.14, y = (n_permute + 1)/5*0.97, size = 3,
label = paste0("p = ", format(morani_p, nsmall = 2))) +
theme(plot.margin = unit(c(0,5.5,0.4,0), 'lines'))
}
if(s %ni% c(0,36)){
temp_morani_fig <- temp_morani_fig +
annotate("text", x = -0.14, y = (n_permute + 1)/5, size = 3,
label = paste0("p = ", format(morani_p, nsmall = 2)))
}
return(list(temp_plot, temp_morani_fig))
}
get_variograms <- function(input_var, input_df, input_models, n_permute, residualize){
variog_plot_list <- list() # initialize container to save plots
morani_plot_list <- list()
set.seed(111)
applied_plots <- lapply(survey_list,
function(x){
variogram_inside_loop(
s = x,
input_var = input_var,
input_df = input_df,
input_models = input_models,
n_permute = n_permute,
residualize = residualize
)})
variog_plot_list <- applied_plots %>% purrr::map(1)
morani_plot_list <- applied_plots %>% purrr::map(2)
variog_plot <- arrangeGrob(grobs = variog_plot_list, nrow = 1, widths = c(1.2,1,1,1.6))
morani_plot <- arrangeGrob(grobs = morani_plot_list, nrow = 1, widths = c(1.2,1,1,1.6))
return(list(variog_plot, morani_plot))
}
# ~10 minutes with 1000 permutations
fig2BC <- get_variograms(input_var = "pcr",
input_df = clu_random_0to5,
input_models = c("Exp", "Mat"),
n_permute = n_permute_variog,
residualize = TRUE)
## Put panels together
fig2 <- plot_grid(#plotlist = list(fig2A, fig2BC[[1]], fig2BC[[2]]),
plotlist = list(fig2BC[[1]], fig2BC[[2]]),
vjust = c(1.5, 1.5),
hjust = c(-0.5, -0.5),
ncol = 1,
labels = "AUTO",
axis = "r",
rel_heights = c(1,1),
label_size = 14)
fig2
if(save_figs){
for (ft in file_types){
ggsave(filename = here(output_path, paste0("fig2_pcr_map_variog.", ft)),
device = ft,
plot = fig2,
width = 9, height = 4, units = "in")
}
}
Figure 3
## calculate correlations and bootstrap CIs -----
cor_method <- "spearman"
cor_ci_df <- expand.grid(curr_ind = c(trach_ind, "prevalence_Pgp3", "prevalence_Ct694", "prevalence_ti", "prevalence_tf"),
curr_survey = survey_list,
lag = survey_list) %>%
crossing(age_group = c("0-5y", "6-9y")) %>%
# only keep lags for survey 36
filter(curr_survey == 36 | lag == 0) %>%
filter(!(age_group == "6-9y" & (curr_survey-lag) %in% c(12,24) & curr_ind %in% c("prevalence_Pgp3", "prevalence_Ct694", "prevalence_sero"))) %>%
# initialize columns for correlation & ci values
mutate(cor_est = NA, cor_lwrci = NA, cor_uprci = NA)
# get correlation for bootstrapped sample
get_bs_cor <- function(temp_xy){
temp_sample <- temp_xy[sample(1:nrow(temp_xy), size = nrow(temp_xy), replace = TRUE), ]
temp_bs_cor <- cor(temp_sample$y, temp_sample$x, method = cor_method, use = "complete.obs") # calculate correlation on complete cases
return(temp_bs_cor)
}
for(i in 1:nrow(cor_ci_df)){
# pull desired x values based on variable, survey and lag
temp_x <- clu_random_modeldata %>%
filter(age_group == cor_ci_df$age_group[i]) %>%
filter(survey == (cor_ci_df$curr_survey[i] - cor_ci_df$lag[i])) %>%
dplyr::select(cluster_id, x = eval(cor_ci_df$curr_ind[i]))
# pull desired y value based on survey
# pcr is the "outcome" of interest in all cases
temp_y <- clu_random_modeldata %>%
filter(age_group == cor_ci_df$age_group[i]) %>%
filter(survey == cor_ci_df$curr_survey[i]) %>%
dplyr::select(cluster_id, y = prevalence_pcr)
# merge x and y values into one dataset
temp_xy <- left_join(temp_x, temp_y, by = "cluster_id")
set.seed(111)
temp_bs_cor <- sapply(1:n_bs_cor, function(x) get_bs_cor(temp_xy = temp_xy))
# calculate correlation and bounds
temp_cor <- cor(temp_xy$y, temp_xy$x, method = cor_method, use = "complete.obs")
cor_ci_df$cor_est[i] <- temp_cor
cor_ci_df$cor_lwrci[i] <- quantile(c(temp_bs_cor), prob = 0.025)
cor_ci_df$cor_uprci[i] <- quantile(c(temp_bs_cor), prob = 0.975)
}
## figure styling ------
# set colors for age groups
age_group_colors <- c("0-5y" = RColorBrewer::brewer.pal(n = 8, name = "Dark2")[1],
"6-9y" = RColorBrewer::brewer.pal(n = 8, name = "Dark2")[3])
fig3AB_legend_df <- data.frame(survey_label = c("Study month 36"), age_group = c("0-5y", "6-9y")) %>%
mutate(survey_label = factor(survey_label, levels = c("Study month 0", "Study month 36"))) %>%
mutate(age_group = factor(age_group, levels = c("0-5y", "6-9y")))
# common styling for panels A and B
fig3AB_theme <- list(facet_wrap(.~survey_label, nrow = 1),
scale_x_continuous(breaks = seq(0, 1, by = 0.25), labels = sprintf(100*seq(0,1,by=0.25), fmt = "%1.0f")),
scale_y_continuous(breaks = seq(0, 1, by = 0.25), labels = sprintf( 100*seq(0,1,by=0.25), fmt = "%1.0f")),
coord_cartesian(xlim = c(0,1), ylim = c(0,1)),
scale_color_manual(values = age_group_colors),
theme_classic(),
theme(#legend.position = "none",
legend.position = c(0.93, 0.16),
legend.title = element_text(size = 7),
legend.text = element_text(size = 6),
#legend.spacing.y = unit(2, 'points'),
legend.key.height = unit(0.26, 'cm'),
legend.key.width = unit(0.35, 'cm'),
panel.grid.major = element_line(color = "grey95"),
axis.text = element_text(size = 7)))
## Panel A: correlation between seroprevalence and PCR prevalence at 0 and 36 -----
fig3A_cor <- cor_ci_df %>%
filter(curr_survey %in% c(0,36), lag == 0, curr_ind == "prevalence_sero") %>%
mutate_at(vars(starts_with("cor")), ~sprintf(., fmt = '%#.2f')) %>%
mutate(survey_label = paste0("Study month ", curr_survey))
fig3A <- clu_random_modeldata %>%
filter(age_group != "10+y", survey %in% c(0, 36)) %>%
mutate(survey_label = paste0("Study month ", survey)) %>%
ggplot(aes(x = prevalence_sero, y = prevalence_pcr, color = age_group)) +
geom_point(size = 1, alpha = 0.7) +
geom_smooth(method = 'lm', formula = y~x, se = F, size = 0.5, lty = "dashed") +
geom_text(data = fig3A_cor, aes(x = 0.3, y = 1 - as.numeric(as.factor(age_group))*0.08, color = age_group,
label = paste0("\u03C1 = ", cor_est, " (", cor_lwrci, ",", cor_uprci, ")")),
size = 3, show.legend = FALSE) + # use unicode to represent rho
labs(x = "Seroprevalence (%)", y = "PCR prevalence (%)", color = "Age group") +
fig3AB_theme
## Panel B: correlation between clinical trachoma and PCR prevalence at 0 and 36 ------
fig3B_cor <- cor_ci_df %>%
filter(curr_survey %in% c(0,36), lag == 0, curr_ind == "prevalence_clin") %>%
mutate_at(vars(starts_with("cor")), ~sprintf(., fmt = '%#.2f')) %>%
mutate(survey_label = paste0("Study month ", curr_survey))
fig3B <- clu_random_modeldata %>%
filter(age_group != "10+y", survey %in% c(0, 36)) %>%
mutate(survey_label = paste0("Study month ", survey)) %>%
ggplot(aes(x = prevalence_clin, y = prevalence_pcr, color = age_group)) +
geom_point(size = 1, alpha = 0.7) +
geom_smooth(method = 'lm', formula = y~x, se = F, size = 0.5, lty = "dashed") +
geom_text(data = fig3B_cor, aes(x = 0.3, y = 1 - as.numeric(as.factor(age_group))*0.08, color = age_group,
label = paste0("\u03C1 = ", cor_est, " (", cor_lwrci, ",", cor_uprci, ")")),
size = 3, show.legend = FALSE) + # use unicode to represent rho
labs(x = "TF/TI prevalence (%)", y = "PCR prevalence (%)", color = "Age group") +
fig3AB_theme
## Panel C: correlation between trachoma indicators with lag -----
# filter for relevant rows of correlation and CI df
fig3C_df <- cor_ci_df %>%
filter(curr_survey == 36) %>%
filter(curr_ind %in% trach_ind)
fig3C <- fig3C_df %>%
mutate_at(vars(starts_with("cor")), ~replace(., age_group == "6-9y" & curr_ind == "prevalence_sero" & lag %in% c(12, 24), -1)) %>%
mutate(curr_ind_label = case_when(curr_ind == "prevalence_sero" ~ "Serology",
curr_ind == "prevalence_pcr" ~ "PCR",
curr_ind == "prevalence_clin" ~ "TF/TI")) %>%
# reorder factors
mutate(curr_ind_label = factor(curr_ind_label, levels = c("PCR", "TF/TI", "Serology"))) %>%
# NEW for x
mutate(x_survey = as.factor(curr_survey - lag)) %>%
mutate(lag = factor(lag, levels = survey_list)) %>%
mutate(lag_label = lag) %>%
# plot
ggplot(aes(x = x_survey, y = cor_est, group = interaction(x_survey, age_group), color = age_group)) +
geom_point(size = 2, position = position_dodge(width = 0.4), alpha = 0.7) +
geom_errorbar(aes(x = x_survey, y = cor_est, ymin = cor_lwrci, ymax = cor_uprci, color = age_group),
width = 0, position = position_dodge(width = 0.4)) +
scale_color_manual(values = age_group_colors) +
facet_wrap(.~curr_ind_label, nrow = 1) +
labs(x = "Month of trachoma indicator measurement",
y = str_wrap("Spearman correlation with month 36 PCR prevalence", width = 30),
color = "Age group", pch = "Trachoma measure") +
scale_y_continuous(breaks = seq(-1, 1, 0.2)) +
coord_cartesian(ylim = c(-0.2, 1)) +
theme_classic() +
theme(legend.position = c(0.96,0.17),
legend.title = element_text(size = 7),
legend.text = element_text(size = 6),
legend.key.width = unit(0.35, 'cm'),
legend.key.height = unit(0.30, 'cm'),
panel.grid.major = element_line(color = "grey95"),
axis.text.y = element_text(size = 7))
## Stitch panels together -----
fig3_top <- plot_grid(plotlist = list(fig3A, fig3B), labels = c('A', 'B'), label_size = 14, nrow = 1)
fig3_bottom <- plot_grid(plotlist = list(fig3C), labels = c('C'), label_size = 14, nrow = 1)
fig3 <- plot_grid(fig3_top, fig3_bottom, ncol = 1)
fig3
if(save_figs){
for (ft in file_types){
ggsave(filename = here(output_path, paste0("fig3_obs_cor.", ft)),
device = ft,
plot = fig3,
width = 9, height = 5, units = "in")
}
}
Figure 4
## combine model results ----
sl_summary_renamed <- sl_summary %>%
mutate(rmse = sqrt(unwt_mse)) %>%
rename(mse = unwt_mse)
combined_summary <- cv_logistic_summary %>% mutate(sl = FALSE) %>%
bind_rows(sl_summary_renamed %>% mutate(sl = TRUE))
## prepare dataframe -----
fig4_df <- combined_summary %>%
# give covariates clean names for legend & set order
mutate(covariates_label = case_when(
covariates == "pcr_only" & is.na(matern_var) & !sl ~ "PCR",
covariates == "clin_only" & is.na(matern_var) & !sl ~ "TF/TI",
covariates == "sero_only" & is.na(matern_var) & !sl ~ "Serology",
covariates == "sero_only" & matern_var == "Matern(1| lat + lon)" & !sl ~ "Serology + Matern",
covariates == "glmnet_vars_2" & is.na(matern_var) & !sl ~ "Geospatial features*",
covariates == "glmnet_vars_2" & matern_var == "Matern(1| lat + lon)" & !sl ~ "Geospatial + Matern",
covariates == "glmnet_and_trach_2" & matern_var == "Matern(1| lat + lon)" & !sl ~ "All trachoma + geospatial + Matern",
covariates == "glmnet_and_trach_2" & sl & sl_method == "spaMM-matern" ~ "All trachoma + geospatial + Matern, ensemble")) %>%
mutate(covariates_label = factor(covariates_label,
levels = c("PCR", "TF/TI",
"Serology", "Serology + Matern",
"Geospatial features*", "Geospatial + Matern",
"All trachoma + geospatial + Matern",
"All trachoma + geospatial + Matern, ensemble"))) %>%
# remove all unnamed covariate groups
filter(!is.na(covariates_label)) %>%
# add lag label & set order of facets
mutate(lag_label = case_when(
input_lag == 0 ~ "Concurrent predictions",
input_lag == 12 ~ "Forward predictions, 12 months",
input_lag == 24 ~ "Forward predictions, 24 months",
input_lag == 36 ~ "Forward predictions, 36 months")) %>%
mutate(lag_label = factor(lag_label, levels = c("Forward predictions, 36 months",
"Forward predictions, 24 months",
"Forward predictions, 12 months",
"Concurrent predictions"))) %>%
mutate_at(vars(r2, lower.ci, upper.ci, rmse), as.numeric)
## write function to create figure -----
get_r2_rmse_fig <- function(input_survey, input_folds_name){
ret <- fig4_df %>%
filter(outcome_survey == input_survey, folds_name == input_folds_name) %>%
ggplot() +
geom_point(aes(x = lag_label, y = r2, color = covariates_label), position = position_dodge(width = 1)) +
geom_errorbar(aes(x = lag_label, y = r2, color = covariates_label, ymin = lower.ci, ymax = upper.ci),
position = position_dodge(width = 1), width = 0) +
geom_hline(yintercept = 0, color = "black", lty = "dotted") +
# add text labels with RMSE
geom_label(aes(x = lag_label, y = -0.45, label = sprintf(rmse, fmt = "%0.2f"), color = covariates_label),
position = position_dodge(width = 1), size = 2, alpha = 0.5, label.padding = unit(0.1, "lines"),
show.legend = FALSE) +
scale_color_manual(values = c(RColorBrewer::brewer.pal(n = 8, name = "Dark2"), "black")) +
coord_cartesian(ylim = c(-0.5,1)) + # limit display of y-values
facet_wrap(.~lag_label, nrow = 1, scales = "free_x") +
labs(y = expression(paste("Cross-validated ", R^2)), color = "Model specification",
caption = "*Includes 12-month precipitation and night light radiance as selected by LASSO") +
theme_classic() +
theme(legend.position = "bottom",
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 12),
panel.grid.major = element_line(color = "grey95"),
legend.text = element_text(size = 7),
legend.title = element_text(size = 8),
legend.key.height = unit(0.5, 'cm'),
plot.caption = element_text(hjust = 0)) + # align caption to left
guides(colour = guide_legend(nrow = 2, byrow = TRUE))
return(ret)
}
## save figure -----
fig4 <- get_r2_rmse_fig(input_survey = 36, input_folds_name = "block_folds_list_15")
fig4
if(save_figs){
for (ft in file_types){
ggsave(filename = here(output_path, paste0("fig4_r2_rmse.", ft)),
device = ft,
plot = fig4,
width = 9.5, height = 3, units = "in")
}
}
Figure 5
## select CV results for figure -----
cv_results <- lapply(
1:length(cv_logistic_results),
function(x){
data.frame(index = cv_logistic_results[[x]]$index,
obs = cv_logistic_results[[x]]$obs,
pred = cv_logistic_results[[x]]$preds,
formula = cv_logistic_results[[x]]$formula) %>%
bind_cols(cv_logistic_summary %>%
filter(param_index == x) %>%
dplyr::select(param_index, covariates, matern_var,
folds_name, outcome_var,
outcome_survey, input_lag))}) %>%
bind_rows() %>%
bind_rows(sl_results)
cv_results_res <- cv_results %>%
filter(outcome_var == "prevalence_pcr",
folds_name == "block_folds_list_15")
## Function `get_ord_pcr_counts`: calculate cumulative proportion of PCR infections (at a given survey) captured by ordered villages -----
# input_df (dataframe): name of input dataframe
# pcr_survey (numeric): survey to use for pcr rankings
# order_var: order variable used to rank villages and count cumulative pcr infections
# lag (numeric): lag between order variable measurement and pcr
## -------------------------------------------------------------------------------
get_ord_pcr_counts <- function(input_df){
temp_df <- input_df %>%
mutate(obs_pos30 = obs*30)
total_pos_pcr <- sum(temp_df %>% pull(obs_pos30))
# use observed values directly for PCR
if(input_df$input_lag[1] == 0 & input_df$covariates[1] == "pcr_only"){
ret <- temp_df %>%
arrange(desc(obs_pos30)) %>%
ungroup() %>%
mutate(ord_index = row_number(),
ord_prop = cumsum(obs_pos30)/total_pos_pcr)
} else {
ret <- temp_df %>%
arrange(desc(pred)) %>%
ungroup() %>%
mutate(ord_index = row_number(),
ord_prop = cumsum(obs_pos30)/total_pos_pcr)
}
return(ret)
}
ord_pcr_grid <- expand.grid(outcome_survey = c(0, 12, 24, 36), input_lag = c(0, 12, 24, 36)) %>%
filter(input_lag == 0 | outcome_survey == 36) %>%
crossing(data.frame(covariates = c("pcr_only", "clin_only", "sero_only", "glmnet_and_trach_2"),
matern_var = c(NA, NA, NA, NA),
sl_method = c(NA, NA, NA, "spaMM-matern"))) %>%
as.data.frame()
ord_pcr_counts <- lapply(
1:nrow(ord_pcr_grid),
function(x){
temp_df <- cv_results_res %>% filter(covariates == ord_pcr_grid[x, "covariates"],
outcome_survey == ord_pcr_grid[x, "outcome_survey"],
input_lag == ord_pcr_grid[x, "input_lag"])
if(is.na(ord_pcr_grid[x, "matern_var"])){
temp_df <- temp_df %>% filter(is.na(matern_var))
} else {
temp_df <- temp_df %>% filter(matern_var == ord_pcr_grid[x, "matern_var"])
}
if(is.na(ord_pcr_grid[x, "sl_method"])){
temp_df <- temp_df %>% filter(is.na(sl_method))
} else {
temp_df <- temp_df %>% filter(sl_method == ord_pcr_grid[x, "sl_method"])
}
get_ord_pcr_counts(temp_df)}) %>%
bind_rows()
## randomly permute villages to get a "null" distribution -----
# df (dataframe): dataframe with variable to count and survey labels
# pcr_survey (numeric): survey of interest
# pcr_var (character): counting variable
# i (numeric): bootstrap iteration
clu_random_0to5_fig5 <- clu_random_0to5 %>% mutate(n_pos_30_pcr = prevalence_pcr * 30)
get_random_ord <- function(df, pcr_survey, pcr_var, i){
df_pcr <- df %>%
filter(survey == pcr_survey) %>%
dplyr::select(cluster_id, all_of(pcr_var))
total_pos_pcr <- sum(df_pcr %>% pull(pcr_var))
ret <- df_pcr %>%
arrange(sample(1:40, replace = FALSE)) %>%
ungroup() %>%
mutate(ord_prop = cumsum(get(pcr_var))/total_pos_pcr) %>%
pull(ord_prop)
return(setNames(list(ret), paste0("bs_",i))) # name added to prevent warning when binding cols
}
set.seed(111)
random_ord_npos30 <- lapply(c(0, 12, 24, 36),
function(s){
lapply(c(1:1000), function(x) get_random_ord(df = clu_random_0to5_fig5, pcr_survey = s, pcr_var = "n_pos_30_pcr", i = x)) %>%
bind_cols() %>%
ungroup() %>%
mutate(pcr_survey = s, ord_index = row_number(), pcr_var = "n_pos_30_pcr")}) %>%
bind_rows()
# get empirical 95% distribution
random_ord_npos30$q0.025 <- apply(random_ord_npos30 %>% dplyr::select(-c(pcr_survey, ord_index, pcr_var)), 1, quantile, probs = 0.025)
random_ord_npos30$q0.975 <- apply(random_ord_npos30 %>% dplyr::select(-c(pcr_survey, ord_index, pcr_var)), 1, quantile, probs = 0.975)
random_ord <- random_ord_npos30 %>% dplyr::select(pcr_survey, ord_index, pcr_var, q0.025, q0.975) %>% mutate(outcome_survey_label = paste0("Month ", pcr_survey))
## prep dataframe for figure -----
fig5_df <- ord_pcr_counts %>%
filter(outcome_survey == 36) %>%
mutate(order_var_label = case_when(
covariates == "clin_only" ~ "TF/TI",
covariates == "sero_only" ~ "Serology",
covariates == "pcr_only" & input_lag != 0 ~ "PCR",
covariates == "glmnet_and_trach_2" ~ "All trachoma + geospatial \n+ Matern, ensemble")) %>%
filter(!is.na(order_var_label)) %>%
# add additional rows to represent fixed optimal PCR ordering at Month 36
bind_rows(ord_pcr_counts %>% filter(outcome_survey == 36, input_lag == 0, covariates == "pcr_only") %>%
mutate(order_var_label = "PCR (observed)") %>%
dplyr::select(-input_lag) %>% crossing(input_lag = survey_list) %>% arrange(input_lag, ord_index)) %>%
mutate(order_var_label = factor(order_var_label, levels = c("PCR (observed)", "PCR", "Serology", "TF/TI", "All trachoma + geospatial \n+ Matern, ensemble"))) %>%
mutate(outcome_survey_label = paste0("Month ", outcome_survey)) %>%
mutate(lag_label = case_when(
input_lag == 0 ~ "Concurrent indicators",
input_lag == 12 ~ "Forward ranking, 12 months",
input_lag == 24 ~ "Forward ranking, 24 months",
input_lag == 36 ~ "Forward ranking, 36 months")) %>%
mutate(lag_label = factor(lag_label,
levels = c("Forward ranking, 36 months", "Forward ranking, 24 months",
"Forward ranking, 12 months", "Concurrent indicators"))) %>%
# to prepare for adding vertical lines to plots
mutate(over80 = ord_prop >= 0.8)
fig5_over80 <- fig5_df %>%
filter(over80) %>%
group_by(order_var_label, outcome_survey, input_lag, lag_label) %>%
filter(row_number() == 1) %>%
ungroup() %>%
mutate(ord_index = as.numeric(ord_index))
## manual jittering for overlapping lines -----
fig5_jitter_val <- 0.2
fig5_over80[which(fig5_over80$input_lag == 0 & fig5_over80$order_var_label == "Serology"), "ord_index"] <- fig5_over80[which(fig5_over80$input_lag == 0 & fig5_over80$order_var_label == "Serology"), "ord_index"] - fig5_jitter_val
fig5_over80[which(fig5_over80$input_lag == 0 & fig5_over80$order_var_label == "All trachoma + geospatial \n+ Matern, ensemble"), "ord_index"] <- fig5_over80[which(fig5_over80$input_lag == 0 & fig5_over80$order_var_label == "All trachoma + geospatial \n+ Matern, ensemble"), "ord_index"] + fig5_jitter_val
fig5_over80[which(fig5_over80$input_lag == 24 & fig5_over80$order_var_label == "PCR"), "ord_index"] <- fig5_over80[which(fig5_over80$input_lag == 24 & fig5_over80$order_var_label == "PCR"), "ord_index"] - fig5_jitter_val
fig5_over80[which(fig5_over80$input_lag == 24 & fig5_over80$order_var_label == "All trachoma + geospatial \n+ Matern, ensemble"), "ord_index"] <- fig5_over80[which(fig5_over80$input_lag == 24 & fig5_over80$order_var_label == "All trachoma + geospatial \n+ Matern, ensemble"), "ord_index"] + fig5_jitter_val
## plot figure -----
trach_ind_colors <- c("PCR (observed)" = "black",
"PCR" = RColorBrewer::brewer.pal(n = 8, name = "Dark2")[1],
"TF/TI" = RColorBrewer::brewer.pal(n = 8, name = "Dark2")[2],
"Serology" = RColorBrewer::brewer.pal(n = 8, name = "Dark2")[3],
"All trachoma + geospatial \n+ Matern, ensemble" = RColorBrewer::brewer.pal(n = 8, name = "Dark2")[8])
fig5 <- ggplot() +
geom_line(data = fig5_df, aes(x = ord_index, y = ord_prop, color = order_var_label)) +
geom_ribbon(data = random_ord %>% filter(pcr_survey == 36),
aes(x = ord_index, ymin = q0.025, ymax = q0.975), fill = "grey", alpha = 0.3) +
geom_segment(data = fig5_over80, aes(x = ord_index, xend = ord_index,
y = -0.1, yend = ord_prop, color = order_var_label),
lty = "dashed", alpha = 0.75) +
geom_point(data = fig5_over80, aes(x = ord_index, y = ord_prop, color = order_var_label), size = 0.9) +
geom_abline(slope = 1/40, intercept = 0, color = "darkgrey", lty = "dotted") +
scale_color_manual(values = trach_ind_colors) +
scale_y_continuous(breaks = seq(0,1,by = 0.2)) +
coord_cartesian(ylim = c(0,1)) +
facet_wrap(.~lag_label, nrow = 1) +
labs(x = "Communities ranked by predictions from given model specification",
y = str_wrap("Cumulative proportion of month 36 PCR infections identified", width = 35),
color = "Model specification") +
theme_classic() +
theme(legend.position = "bottom",
legend.text = element_text(size = 7),
legend.title = element_text(size = 8))
fig5
if(save_figs){
for (ft in file_types){
ggsave(filename = here(output_path, paste0("fig5_cum_pcr_m36.", ft)),
device = ft,
plot = fig5,
width = 9.5, height = 3.2, units = "in")
}
}